Libraries

library(tidyverse)
library(tidymodels)
library(tidytext)
library(plotly)
library(ggpubr)
library(GGally)
library(embed)
library(here)

Load Data

Processed tracks

tracks_summary_file <- here("data", "processed", "tracks_summary.tsv")
tracks_summary_df <- read_tsv(tracks_summary_file, show_col_types = FALSE) %>% 
  mutate(
    across(contains("id"), as.character),
    divided = as.logical(divided), 
    across(contains("filamented_"), ~factor(
        .x,
        levels = c(FALSE, TRUE),
        labels = c("Not filamented", "Filamented")
      )
    ),
    across(contains("survived_"), ~factor(
        .x,
        levels = c(FALSE, TRUE),
        labels = c("Not survived", "Survived")
      )
    ),
    cell_status = paste0(filamented_track, " - ", survived_strict) %>% 
      factor()
  ) %>%
  relocate(where(is.character), where(is.factor), where(is.logical)) %>% 
  filter(initial_time <= centered_antibiotic_start_time, end_time > centered_antibiotic_start_time) %>% 
  glimpse()
## Rows: 13,524
## Columns: 28
## $ experiment_id                               <chr> "Chromosome", "Chromosome"…
## $ id                                          <chr> "xy01_1_10.008-34.000", "x…
## $ filamented_track                            <fct> Filamented, Filamented, Fi…
## $ survived_strict                             <fct> Survived, Survived, Not su…
## $ survived_soft                               <fct> Survived, Survived, Surviv…
## $ cell_status                                 <fct> Filamented - Survived, Fil…
## $ exists_before_experiment                    <lgl> TRUE, TRUE, FALSE, FALSE, …
## $ exists_before_or_during_experiment          <lgl> TRUE, TRUE, TRUE, TRUE, TR…
## $ exists_after_experiment                     <lgl> TRUE, TRUE, TRUE, TRUE, TR…
## $ divided                                     <lgl> TRUE, TRUE, TRUE, TRUE, TR…
## $ initial_time                                <dbl> 0, 10, 60, 60, 0, 0, 0, 0,…
## $ end_time                                    <dbl> 240, 160, 170, 240, 170, 1…
## $ lived_time                                  <dbl> 240, 150, 110, 180, 170, 1…
## $ centered_antibiotic_start_time              <dbl> 60, 60, 60, 60, 60, 60, 60…
## $ centered_antibiotic_end_time                <dbl> 100, 100, 100, 100, 100, 1…
## $ sos_at_time                                 <dbl> 20, 10, 50, 40, 30, 70, 70…
## $ dead_at_time                                <dbl> NA, 170, 180, NA, 180, 170…
## $ initial_length                              <dbl> 25.84965, 41.41517, 30.751…
## $ initial_gfp                                 <dbl> 0.9977303, 0.9980406, 0.99…
## $ initial_ds_red                              <dbl> 2.987506, 2.987671, 3.0091…
## $ end_length                                  <dbl> 56.78846, 134.41667, 107.4…
## $ end_gfp                                     <dbl> 0.9979626, 0.9963134, 0.99…
## $ end_ds_red                                  <dbl> 2.983168, 2.980438, 2.9792…
## $ sos_length                                  <dbl> 41.41517, 41.41517, 49.549…
## $ sos_gfp                                     <dbl> 0.9980406, 0.9980406, 0.99…
## $ sos_ds_red                                  <dbl> 2.987671, 2.987671, 3.0841…
## $ n_divisons                                  <dbl> 7, 1, 1, 7, 4, 2, 3, 5, 1,…
## $ time_since_last_divison_to_experiment_start <dbl> 50, 50, NA, NA, 10, 50, 50…

Exploratory Data Analysis

Set default plot style

theme_set(
  theme_bw() +
  theme(legend.position = "top")
)

Common factor conversion

parse_metrics_column <- function(.data, metric_column) {
  .data %>% 
    mutate(
      {{ metric_column }} := str_remove(
        string = {{ metric_column }},
        pattern = "_(.+)"
      ) %>% 
        factor(
          levels = c("initial", "sos", "end"),
          labels = c("Initial", "SOS", "End")
        )
    )
}

Cells distribution across experiments

tracks_summary_df %>% 
  count(experiment_id, cell_status) %>%
  group_by(experiment_id) %>% 
  mutate(
    percentage = n / sum(n) * 100,
    ymax = cumsum(percentage),
    ymin = c(0, head(ymax, -1)),
    labels = paste0(format(percentage, digits = 2), "%"),
    labels_position = (ymax + ymin) / 2,
    total_label = paste0("Total:\n", format(sum(n), big.mark = ","), " cells")
  ) %>% 
  ungroup() %>% 
  identity() %>% 
  ggplot(
    aes(
      ymin = ymin,
      ymax = ymax,
      xmin = 3,
      xmax = 4
    )
  ) +
  geom_rect(
    size = 1.5,
    color = "white",
    aes(fill = cell_status)
  ) +
  geom_label(
    x = 2,
    aes(
      y = labels_position,
      label = labels
    ),
    label.size = NA,
    size = 3.5
  ) +
  geom_text(aes(x = -Inf, y = -Inf, label = total_label), hjust = 0.5, vjust = 0.5) +
  facet_grid(. ~ experiment_id) +
  coord_polar(theta = "y") +
  xlim(c(-1, 4)) +
  guides(
    fill = guide_legend(ncol = 2)
  ) +
  theme_void() +
  theme(
    legend.position = "bottom"
  ) +
  labs(
    fill = "Cell status"
  ) +
  NULL

Temporal metrics distribution

Scaled density GFP

tracks_summary_df %>% 
  select(-sos_gfp) %>% 
  pivot_longer(
    cols = contains("gfp"),
    names_to = "metric",
    values_to = "value"
  ) %>% 
  parse_metrics_column(metric) %>% 
  filter(!is.na(value)) %>% 
  ggplot(aes(x = value, color = cell_status, fill = cell_status)) +
  geom_density(aes(y = ..scaled..), alpha = 1 / 4) +
  facet_grid(metric ~ experiment_id) +
  scale_y_continuous(
    labels = scales::percent
  ) +
  theme(
    panel.spacing.y = unit(0.5, "lines")
  ) +
  guides(
    color = guide_legend(ncol = 2),
    fill = guide_legend(ncol = 2)
  ) +
  labs(
    x = "Normalized mean GFP (log10)",
    y = "Scaled density",
    color = "Cell status",
    fill = "Cell status"
  ) +
  NULL

Scaled density Length

tracks_summary_df %>% 
  select(-sos_length) %>% 
  pivot_longer(
    cols = contains("length"),
    names_to = "metric",
    values_to = "value"
  ) %>% 
  parse_metrics_column(metric) %>% 
  filter(!is.na(value)) %>% 
  ggplot(aes(x = value, color = cell_status, fill = cell_status)) +
  geom_density(aes(y = ..scaled..), alpha = 1 / 4) +
  facet_grid(metric ~ experiment_id) +
  scale_y_continuous(
    labels = scales::percent
  ) +
  coord_cartesian(xlim = c(0, 150)) +
  theme(
    panel.spacing.y = unit(0.5, "lines")
  ) +
  guides(
    color = guide_legend(ncol = 2),
    fill = guide_legend(ncol = 2)
  ) +
  labs(
    x = "Cell length",
    y = "Scaled density",
    color = "Cell status",
    fill = "Cell status"
  ) +
  NULL

But normally we only have two availble variables…

tracks_summary_df %>% 
  ggplot(aes(x = initial_gfp, y = initial_length, color = cell_status)) +
  geom_point(alpha = 1/20) +
  facet_grid(~experiment_id) +
  guides(
    color = guide_legend(ncol = 2, override.aes = list(alpha = 1)),
    fill = guide_legend(ncol = 2)
  ) +
  labs(
    x = "Initial normalized GFP (log10)",
    y = "Initial length",
    color = "Cell status"
  ) +
  NULL

tracks_summary_df %>% 
  ggplot(aes(x = end_gfp - initial_gfp, y = end_length - initial_length, color = cell_status)) +
  geom_point(alpha = 1/20) +
  facet_grid(~experiment_id) +
  guides(
    color = guide_legend(ncol = 2, override.aes = list(alpha = 1)),
    fill = guide_legend(ncol = 2)
  ) +
  labs(
    x = "End normalized GFP - Initial normalized GFP (log10)",
    y = "End length - Initial length",
    color = "Cell status"
  ) +
  NULL

Life time

Classes distribution

tracks_summary_df %>% 
  ggplot(aes(x =factor(lived_time), fill = cell_status)) +
  geom_bar(position = "fill", stat = "count", width = 1) +
  facet_grid(experiment_id ~ .) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), labels = scales::percent) +
  guides(
    color = guide_legend(ncol = 2),
    fill = guide_legend(ncol = 2)
  ) +
  theme(
    panel.spacing.y = unit(0.9, "lines"),
    panel.grid = element_blank()
  ) +
  labs(
    x = "Cell life time",
    y = "Percentage of cells",
    fill = "Cell status"
  ) +
  NULL

Cleveland dot plot of GFP

tracks_summary_df %>% 
  select(experiment_id, id, lived_time, filamented_track, survived_strict, contains("gfp")) %>% 
  rename(survived = contains("survived")) %>% 
  pivot_longer(
    cols = contains("gfp"),
    names_to = "metric"
  ) %>% 
  filter(!is.na(value)) %>% 
  group_by(experiment_id, lived_time, filamented_track, survived, metric) %>% 
  summarize(
    value = mean(value),
    .groups = "drop"
  ) %>%
  parse_metrics_column(metric) %>% 
  identity() %>% 
  ggplot(aes(x = lived_time, y = value)) +
  geom_line(aes(group = lived_time)) +
  geom_point(aes(color = metric, shape = survived)) +
  facet_grid(filamented_track ~ experiment_id) +
  scale_color_hue(direction = -1, h.start = 90) +
  scale_fill_hue(direction = -1, h.start = 90) +
  labs(
    x = "Cell life time",
    y = "Average normalized GFP value",
    color = "Length type",
    shape = "Cell status"
  ) +
  theme(
    panel.spacing.x = unit(0.9, "lines"),
  ) +
  NULL

Cleveland dot plot of length

tracks_summary_df %>% 
  select(experiment_id, id, lived_time, filamented_track, survived_strict, contains("length")) %>% 
  rename(survived = contains("survived")) %>% 
  pivot_longer(
    cols = contains("length"),
    names_to = "metric"
  ) %>% 
  filter(!is.na(value)) %>% 
  group_by(experiment_id, lived_time, filamented_track, survived, metric) %>% 
  summarize(
    value = mean(value),
    .groups = "drop"
  ) %>%
  parse_metrics_column(metric) %>% 
  identity() %>% 
  ggplot(aes(x = lived_time, y = value)) +
  geom_line(aes(group = lived_time)) +
  geom_point(aes(color = metric, shape = survived)) +
  facet_grid(filamented_track ~ experiment_id) +
  scale_color_hue(direction = -1, h.start = 90) +
  scale_fill_hue(direction = -1, h.start = 90)  +
  labs(
    x = "Cell life time",
    y = "Average length value",
    color = "Length type",
    shape = "Cell status"
  ) +
  theme(
    panel.spacing.x = unit(0.9, "lines"),
  ) +
  NULL

Division

Number of divisions

tracks_summary_df %>% 
  group_by(experiment_id, cell_status) %>% 
  summarise(
    n = median(n_divisons),
    .groups = "drop"
  ) %>% 
  ggplot(aes(x = fct_reorder(cell_status, n), y = n, color = cell_status, fill = cell_status)) +
  geom_point() +
  geom_segment(aes(xend = cell_status, y=0, yend = n)) +
  facet_grid(. ~ experiment_id) +
  theme(
    panel.grid.major = element_blank(),
    legend.position = "none"
  ) +
  labs(
    x = "Cell status",
    y = "Median number of division"
  ) +
  coord_flip() +
  NULL

Time since last division

tracks_summary_df %>% 
  filter(!is.na(time_since_last_divison_to_experiment_start)) %>% 
  ggplot(aes(x = factor(time_since_last_divison_to_experiment_start), fill = cell_status)) +
  geom_bar(position = "dodge", stat = "count", color = "white") +
  facet_grid(experiment_id ~ ., scales = "free_y") +
  #scale_x_discrete(expand = c(0, 0)) +
  #scale_y_continuous(expand = c(0, 0)) +
  guides(
    color = guide_legend(ncol = 2),
    fill = guide_legend(ncol = 2)
  ) +
  theme(
    panel.spacing.y = unit(0.9, "lines"),
    panel.grid.major = element_blank()
  ) +
  labs(
    x = "Time since last division to experiment start",
    y = "Count of cells",
    fill = "Cell status"
  ) +
  NULL

PCA

Split datasets

experiment_datasets <- tracks_summary_df %>% 
  select(experiment_id, cell_status, contains("length"), contains("gfp"), -contains("sos"), -contains("end")) %>% 
  group_by(experiment_id) %>% 
  {
    grouped_data <- .
    group_split(grouped_data) %>% 
      set_names(nm = group_keys(grouped_data) %>% pull())
  } %>% 
  map(select, -experiment_id) %>% 
  identity()

chromosome_df <- experiment_datasets$Chromosome
plasmid_df <- experiment_datasets$Plasmid

Chromosome

c_pca_rec <- recipe(cell_status ~ ., data = chromosome_df) %>% 
  step_naomit(all_predictors()) %>% 
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

c_pca_prep <- prep(c_pca_rec)
c_pca_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          2
## 
## Training data contained 1061 data points and no missing data.
## 
## Operations:
## 
## Removing rows with NA values in all_predictors()
## Centering and scaling for initial_length, initial_gfp [trained]
## PCA extraction with initial_length, initial_gfp [trained]
c_tidied_pca <- tidy(c_pca_prep, 3)

c_tidied_pca %>%
  filter(component %in% paste0("PC", 1:5)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1) +
  labs(y = NULL)

c_pca_prep %>% 
  juice() %>% 
  ggplot(aes(x = PC1, y = PC2, color = cell_status)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = 0) +
  geom_jitter(size = 0.7) +
  guides(
    color = guide_legend(ncol = 2, override.aes = list(alpha = 1)),
    fill = guide_legend(ncol = 2)
  ) +
  labs(
    color = "Cell status"
  ) +
  NULL

Plasmid

p_pca_rec <- recipe(cell_status ~ ., data = plasmid_df) %>% 
  step_naomit(all_predictors()) %>% 
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

p_pca_prep <- prep(p_pca_rec)
p_pca_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          2
## 
## Training data contained 12463 data points and no missing data.
## 
## Operations:
## 
## Removing rows with NA values in all_predictors()
## Centering and scaling for initial_length, initial_gfp [trained]
## PCA extraction with initial_length, initial_gfp [trained]
p_tidied_pca <- tidy(p_pca_prep, 3)

p_tidied_pca %>%
  filter(component %in% paste0("PC", 1:5)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1) +
  labs(y = NULL)

p_pca_prep %>% 
  juice() %>% 
  ggplot(aes(x = PC1, y = PC2, color = cell_status)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = 0) +
  geom_jitter(size = 0.1, alpha = 1/3) +
  scale_x_continuous(limits = c(NA,  5)) +
  scale_y_continuous(limits = c(NA, 7)) +
  guides(
    color = guide_legend(ncol = 2, override.aes = list(alpha = 1, size = 1)),
    fill = guide_legend(ncol = 2)
  ) +
  labs(
    color = "Cell status"
  ) +
  NULL

UMAP

p_umap_rec <- recipe(cell_status ~ ., data = plasmid_df) %>% 
  step_naomit(all_predictors()) %>% 
  step_normalize(all_predictors()) %>%
  step_umap(all_predictors())

p_umap_prep <- prep(p_umap_rec)
p_umap_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          2
## 
## Training data contained 12463 data points and no missing data.
## 
## Operations:
## 
## Removing rows with NA values in all_predictors()
## Centering and scaling for initial_length, initial_gfp [trained]
## UMAP embedding for initial_length, initial_gfp [trained]
juice(p_umap_prep) %>%
  ggplot(aes(umap_1, umap_2, label )) +
  geom_point(aes(color = cell_status), alpha = 0.7, size = 2) +
  labs(
    x = "UMAP 1",
    y = "UMAP 2",
    color = "Cell status"
  ) +
  guides(
    color = guide_legend(ncol = 2),
    fill = guide_legend(ncol = 2)
  ) +
  NULL

Chromosome

c_umap_rec <- recipe(cell_status ~ ., data = chromosome_df) %>% 
  step_naomit(all_predictors()) %>% 
  step_normalize(all_predictors()) %>%
  step_umap(all_predictors())

c_umap_prep <- prep(c_umap_rec)
c_umap_prep
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          2
## 
## Training data contained 1061 data points and no missing data.
## 
## Operations:
## 
## Removing rows with NA values in all_predictors()
## Centering and scaling for initial_length, initial_gfp [trained]
## UMAP embedding for initial_length, initial_gfp [trained]
juice(c_umap_prep) %>% 
  ggplot(aes(umap_1, umap_2, label )) +
  geom_point(aes(color = cell_status), alpha = 0.7, size = 2) +
  guides(
    color = guide_legend(ncol = 2),
    fill = guide_legend(ncol = 2)
  ) +
  labs(
    x = "UMAP 1",
    y = "UMAP 2",
    color = "Cell status"
  )

Paired matrics

Chromosome

chromosome_df %>% 
  select(cell_status, where(is.numeric)) %>% 
  rename(`Initial GFP` = initial_gfp, `Initial length` = initial_length) %>% 
  ggpairs(
    data = .,
    columns = 2:ncol(.),
    aes(color = cell_status, alpha = 1/1000)
  )

Plasmid

plasmid_df %>% 
  select(cell_status, where(is.numeric)) %>% 
  rename(`Initial GFP` = initial_gfp, `Initial length` = initial_length) %>% 
  ggpairs(
    data = .,
    columns = 2:ncol(.),
    aes(color = cell_status, alpha = 1/1000),
    #upper = list(continuous = "points")
  )